In [1]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.graph_objs as go
import plotly.express as px
In [2]:
#importing dataset
data=pd.read_csv('CustomersChurn.csv')
data
Out[2]:
CLIENTNUM Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count ... Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio Attrition_Flag
0 768805383 45 M 3 High School Married $60K - $80K Blue 39 5 ... 3 12691.0 777 11914.0 1.335 1144 42 1.625 0.061 Existing Customer
1 818770008 49 F 5 Graduate Single Less than $40K Blue 44 6 ... 2 8256.0 864 7392.0 1.541 1291 33 3.714 0.105 Existing Customer
2 713982108 51 M 3 Graduate Married $80K - $120K Blue 36 4 ... 0 3418.0 0 3418.0 2.594 1887 20 2.333 0.000 Existing Customer
3 769911858 40 F 4 High School NaN Less than $40K Blue 34 3 ... 1 3313.0 2517 796.0 1.405 1171 20 2.333 0.760 Existing Customer
4 709106358 40 M 3 Uneducated Married $60K - $80K Blue 21 5 ... 0 4716.0 0 4716.0 2.175 816 28 2.500 0.000 Existing Customer
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10122 772366833 50 M 2 Graduate Single $40K - $60K Blue 40 3 ... 3 4003.0 1851 2152.0 0.703 15476 117 0.857 0.462 Existing Customer
10123 710638233 41 M 2 NaN Divorced $40K - $60K Blue 25 4 ... 3 4277.0 2186 2091.0 0.804 8764 69 0.683 0.511 Attrited Customer
10124 716506083 44 F 1 High School Married Less than $40K Blue 36 5 ... 4 5409.0 0 5409.0 0.819 10291 60 0.818 0.000 Attrited Customer
10125 717406983 30 M 2 Graduate NaN $40K - $60K Blue 36 4 ... 3 5281.0 0 5281.0 0.535 8395 62 0.722 0.000 Attrited Customer
10126 714337233 43 F 2 Graduate Married Less than $40K Silver 25 6 ... 4 10388.0 1961 8427.0 0.703 10294 61 0.649 0.189 Attrited Customer

10127 rows × 21 columns

In [3]:
#missing values
data.isnull().sum()
Out[3]:
CLIENTNUM                      0
Customer_Age                   0
Gender                         0
Dependent_count                0
Education_Level             1519
Marital_Status               749
Income_Category                0
Card_Category                  0
Months_on_book                 0
Total_Relationship_Count       0
Months_Inactive_12_mon         0
Contacts_Count_12_mon          0
Credit_Limit                   0
Total_Revolving_Bal            0
Avg_Open_To_Buy                0
Total_Amt_Chng_Q4_Q1           0
Total_Trans_Amt                0
Total_Trans_Ct                 0
Total_Ct_Chng_Q4_Q1            0
Avg_Utilization_Ratio          0
Attrition_Flag                 0
dtype: int64
In [4]:
#Duplicates
data.duplicated().sum()
Out[4]:
0
In [5]:
gender_counts = data['Gender'].value_counts()
fig = px.pie(
    names=gender_counts.index, 
    values=gender_counts.values, 
    color_discrete_sequence=['skyblue', 'green'], 
    labels=['Male', 'Female'], 
    width=800, 
    height=350,  
)
fig.update_layout(title_text='Distribution of Gender', title_x=0.48,
    title_y=0.95
)

# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60 ))

fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
In [6]:
#Proportion of Attrited vs Existing Customer

# Create the pie chart using Plotly Express
# Calculate the value counts of Attrition_Flag
attrition_counts = data['Attrition_Flag'].value_counts()

# Create the pie chart using Plotly Express
fig = px.pie(
    names=attrition_counts.index, 
    values=attrition_counts.values, 
    color_discrete_sequence=['skyblue', 'yellow'], 
    labels=['Attrited Customer', 'Existing Customer'], 
    width=800, 
    height=350 
)
fig.update_layout(title_text='Proportion of Attrited vs Existing Customer', title_x=0.44, title_y=0.9)

# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60))
fig.layout.template = 'plotly_dark'

# Show the plot
fig.show()
In [7]:
#Proportion of Attrited Customers by Marital status
 # Count occurrences of Attrition for each Education_Level
counts = data.groupby(['Marital_Status', 'Attrition_Flag']).size().unstack(fill_value=0)

# Create the bar chart using Plotly Express
fig = px.bar(counts, x=counts.index, y=counts.columns,
             color_discrete_map={'Attrited Customer': 'Gold', 'Existing Customer': 'lightyellow'},
             width=800, height=500, text_auto='.2s')

# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60),
)
fig.update_layout(title_text='Proportion of Attrited Customers by Marital status', title_x=0.44, title_y=0.95)
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Education Level')
fig.update_yaxes(visible=True, title='Values')
fig.layout.template = 'plotly_dark'

# Show the plot
fig.show()
In [8]:
#Proportion of Attrited Customers by Education Level
 # Count occurrences of Attrition for each Education_Level
counts = data.groupby(['Education_Level', 'Attrition_Flag']).size().unstack(fill_value=0)

# Create the bar chart using Plotly Express
fig = px.bar(counts, x=counts.index, y=counts.columns,
             color_discrete_map={'Attrited Customer': 'gold', 'Existing Customer': 'lightskyblue'},
             width=800, height=500, text_auto='.2s')

# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60),
)
fig.update_layout(title_text='Proportion of Attrited Customers by Education Level', title_x=0.44, title_y=0.95)
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Education Level')
fig.update_yaxes(visible=True, title='Values')
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
In [9]:
#Proportion of Attrited Customers by Income_Category

# Assuming you have already calculated counts DataFrame using groupby
counts = data.groupby(['Income_Category', 'Attrition_Flag']).size().unstack(fill_value=0)

# Create the bar chart using Plotly Express
fig = px.bar(counts, x=counts.index, y=counts.columns,
             color_discrete_map={'Attrited Customer': 'darkblue', 'Existing Customer': 'skyblue'},
             width=800, height=500, text_auto='.2s')

# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60),
)
fig.update_layout(title_text='Proportion of Attrited Customers by Income_Category', title_x=0.44, title_y=0.95)
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Income Category')
fig.update_yaxes(visible=True, title='Values')
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
In [10]:
#Proportion by Educational level
# Calculate value counts of Education_Level column
education_counts = data.Education_Level.value_counts()

# Create the pie chart using Plotly Express
fig = px.pie(
    names=education_counts.index, 
    values=education_counts.values, 
    color_discrete_sequence=['skyblue', 'yellow'], 
    labels=['Attrited Customer', 'Existing Customer'], 
    width=800, 
    height=350 
)
fig.update_layout(title_text='Proportion by Educational level', title_x=0.44, title_y=0.95)

# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60))
fig.layout.template = 'plotly_dark'

# Show the plot
fig.show()
In [12]:
##Propotion Of Different Card Categories

Card_Category_counts = data.Card_Category.value_counts()

fig = px.pie(
    names=Card_Category_counts.index, 
    values=Card_Category_counts.values, 
    color_discrete_sequence=['skyblue', 'yellow','gold','red'], 
    labels=['Attrited Customer', 'Existing Customer'], 
    width=800, 
    height=350 
)
fig.update_layout(title_text='Propotion Of Different Card Categories', title_x=0.44, title_y=0.95)

# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60))
fig.layout.template = 'plotly_dark'

# Show the plot
fig.show()
In [13]:
#Propotion Of Credit limit by Income category

# Create the bar chart using Plotly Express
fig=px.bar(data, x='Income_Category',y='Credit_Limit',width=800, height=500, text_auto='.2s',
           color_discrete_sequence=['red']
            )
          
# Update layout
fig.update_layout(
    margin=dict(t=50, b=50, l=60, r=60), template = 'plotly_dark', 
    title_text='Propotion Of Credit limit by Income category', 
    title_x=0.44, title_y=0.95 
)

fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Income Category')
fig.update_yaxes(visible=True, title='Credit Limit')
# Show the plot
fig.show()
In [14]:
#converting string to numerical value
data['Attrition_Flag']=data['Attrition_Flag'].map({'Existing Customer':0,'Attrited Customer':1})
data.head()
Out[14]:
CLIENTNUM Customer_Age Gender Dependent_count Education_Level Marital_Status Income_Category Card_Category Months_on_book Total_Relationship_Count ... Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio Attrition_Flag
0 768805383 45 M 3 High School Married $60K - $80K Blue 39 5 ... 3 12691.0 777 11914.0 1.335 1144 42 1.625 0.061 0
1 818770008 49 F 5 Graduate Single Less than $40K Blue 44 6 ... 2 8256.0 864 7392.0 1.541 1291 33 3.714 0.105 0
2 713982108 51 M 3 Graduate Married $80K - $120K Blue 36 4 ... 0 3418.0 0 3418.0 2.594 1887 20 2.333 0.000 0
3 769911858 40 F 4 High School NaN Less than $40K Blue 34 3 ... 1 3313.0 2517 796.0 1.405 1171 20 2.333 0.760 0
4 709106358 40 M 3 Uneducated Married $60K - $80K Blue 21 5 ... 0 4716.0 0 4716.0 2.175 816 28 2.500 0.000 0

5 rows × 21 columns

In [15]:
#dont need this colunm
X = data.drop(columns=['Gender','Education_Level','Income_Category','Marital_Status','Card_Category'], inplace=True)
y = data['Attrition_Flag']
In [16]:
data_array = np.array(data)
data
Out[16]:
CLIENTNUM Customer_Age Dependent_count Months_on_book Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio Attrition_Flag
0 768805383 45 3 39 5 1 3 12691.0 777 11914.0 1.335 1144 42 1.625 0.061 0
1 818770008 49 5 44 6 1 2 8256.0 864 7392.0 1.541 1291 33 3.714 0.105 0
2 713982108 51 3 36 4 1 0 3418.0 0 3418.0 2.594 1887 20 2.333 0.000 0
3 769911858 40 4 34 3 4 1 3313.0 2517 796.0 1.405 1171 20 2.333 0.760 0
4 709106358 40 3 21 5 1 0 4716.0 0 4716.0 2.175 816 28 2.500 0.000 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10122 772366833 50 2 40 3 2 3 4003.0 1851 2152.0 0.703 15476 117 0.857 0.462 0
10123 710638233 41 2 25 4 2 3 4277.0 2186 2091.0 0.804 8764 69 0.683 0.511 1
10124 716506083 44 1 36 5 3 4 5409.0 0 5409.0 0.819 10291 60 0.818 0.000 1
10125 717406983 30 2 36 4 3 3 5281.0 0 5281.0 0.535 8395 62 0.722 0.000 1
10126 714337233 43 2 25 6 2 4 10388.0 1961 8427.0 0.703 10294 61 0.649 0.189 1

10127 rows × 16 columns

In [17]:
X = data.iloc[:, : -1]
y = data.iloc[:, -1]
In [18]:
from sklearn.preprocessing import OrdinalEncoder
In [19]:
oe = OrdinalEncoder()
X = oe.fit_transform(X)
In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
In [21]:
#Features Scalling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train)
print(X_test)
[[0.67397531 0.47727273 0.4        ... 0.37096774 0.26055489 0.07276507]
 [0.75140741 0.63636364 0.2        ... 0.75       0.47527141 0.26611227]
 [0.60918519 0.34090909 0.8        ... 0.54032258 0.75271411 0.29209979]
 ...
 [0.5602963  0.77272727 0.2        ... 0.45967742 0.67792521 0.17047817]
 [0.56967901 0.56818182 0.6        ... 0.87903226 0.53920386 0.04158004]
 [0.71812346 0.61363636 0.2        ... 0.21774194 0.43667069 0.84199584]]
[[0.96958025 0.40909091 1.         ... 0.39516129 0.66827503 0.1008316 ]
 [0.36661728 0.40909091 0.6        ... 0.40322581 0.28829916 0.        ]
 [0.85264198 0.25       0.6        ... 0.45967742 0.24728589 0.78690229]
 ...
 [0.67930864 0.75       0.2        ... 0.65322581 0.58504222 0.33575884]
 [0.67595062 0.40909091 0.2        ... 0.47580645 0.31966224 0.07276507]
 [0.20493827 0.27272727 0.         ... 0.21774194 0.23763571 0.13513514]]
In [22]:
#applying model on the training set
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
In [23]:
#to fill the missing value
imputer= SimpleImputer(strategy='mean')
   
# Fit the model on the training data
imputer.fit(X_train, y_train)
Out[23]:
SimpleImputer()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SimpleImputer()
In [24]:
clf=RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
clf.fit(X_train, y_train)
Out[24]:
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
In [25]:
#predicting result

pred=(clf.predict(scaler.transform([[768805383,45,3,39,5,1, 3, 12691, 777,11914, 1.335,1144,42,1.625,0.061]])))
results= 'This customer will not churn' if pred == 0 else "This customer will churn"
print(results)
This customer will not churn
In [26]:
pred=(clf.predict(scaler.transform([[71174388,43,3,35,5,2,3,4026,0,4026,0.483,1237,32,0.6,0]])))
results= 'This customer will not churn' if pred == 0 else "This customer will churn"
print(results)
This customer will churn
In [27]:
pred=(clf.predict(scaler.transform([[711791583,52,1,40,1,2,2,2317,0,2317,1.005,884,19,0.727,0]])))
results= 'This customer will not churn' if pred == 0 else "This customer will churn"
print(results)
This customer will churn
In [28]:
y_pred = clf.predict(X_test)
y_pred
Out[28]:
array([0, 1, 0, ..., 0, 0, 0], dtype=int64)
In [28]:
#Confusion Matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
 #Create ConfusionMatrixDisplay object
ConfusionMatrixDisplay.from_estimator(clf,X_test,y_test)
print(confusionMatrix)
accuracy_score(y_test, y_pred)
[[2554   29]
 [ 118  338]]
Out[28]:
0.9516288252714709
In [29]:
print (classification_report(y_pred, y_test))
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      2672
           1       0.74      0.92      0.82       367

    accuracy                           0.95      3039
   macro avg       0.87      0.94      0.90      3039
weighted avg       0.96      0.95      0.95      3039

In [30]:
#applying SVC moddel
svc=SVC(kernel='rbf',random_state=0)
svc.fit(X_train, y_train)
pred_svc=svc.predict(X_test)
In [28]:
#Confusion Matrix
confusionMatrix = confusion_matrix(y_test, pred_svc)
 #Create ConfusionMatrixDisplay object
ConfusionMatrixDisplay.from_estimator(svc,X_test,y_test)
print(confusionMatrix)
accuracy_score(y_test, pred_svc)
[[ 322  134]
 [  60 2523]]
Out[28]:
0.9361632115827575
In [31]:
print (classification_report(pred_svc, y_test))
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      2657
           1       0.71      0.84      0.77       382

    accuracy                           0.94      3039
   macro avg       0.84      0.90      0.87      3039
weighted avg       0.94      0.94      0.94      3039

In [32]:
#applying Adaboostclassifier moddel
abc=AdaBoostClassifier()
abc.fit(X_train, y_train)
pred_abc=abc.predict(X_test)
In [32]:
#Confusion Matrix
confusionMatrix = confusion_matrix(y_test, pred_abc)
 #Create ConfusionMatrixDisplay object
ConfusionMatrixDisplay.from_estimator(abc,X_test,y_test)
print(confusionMatrix)
accuracy_score(y_test, pred_abc)
[[ 378   78]
 [  51 2532]]
Out[32]:
0.9575518262586377
In [33]:
print (classification_report(pred_abc, y_test))
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2610
           1       0.83      0.88      0.85       429

    accuracy                           0.96      3039
   macro avg       0.90      0.93      0.91      3039
weighted avg       0.96      0.96      0.96      3039

In [34]:
clf=RandomForestClassifier(random_state=42)
abc=AdaBoostClassifier(random_state=42,learning_rate=0.7)
svc=SVC(random_state=42,kernel='rbf')

f1_cross_val_scores = cross_val_score(clf,X_train, y_train,cv=5,scoring='f1')
ada_f1_cross_val_scores=cross_val_score(abc,X_train,y_train,cv=5,scoring='f1')
svm_f1_cross_val_scores=cross_val_score(svc, X_train,y_train,cv=5,scoring='f1')

fig = make_subplots(rows=3, cols=1,shared_xaxes=True,subplot_titles=('Random Forest Cross Val Scores',
                                                                     'Adaboost Cross Val Scores',
                                                                    'SVM Cross Val Scores'))

fig.add_trace(
    go.Scatter(x=list(range(0,len(f1_cross_val_scores))),y=f1_cross_val_scores,name='Random Forest'),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=list(range(0,len(ada_f1_cross_val_scores))),y=ada_f1_cross_val_scores,name='Adaboost'),
    row=2, col=1
)
fig.add_trace(
    go.Scatter(x=list(range(0,len(svm_f1_cross_val_scores))),y=svm_f1_cross_val_scores,name='SVM'),
    row=3, col=1
)

fig.update_layout(height=700, width=900, title_text="Different Model 5 Fold Cross Validation")
fig.update_yaxes(visible=True,title_text="F1 Score")
fig.update_xaxes(visible=True,title_text="Fold")

fig.show()
In [ ]:

In [ ]: